# Add code here to load all the required libraries with `library()`.
# Do not include any `install.packages()` calls for any required packages in this rmd file.
library(ggplot2)
library(readr)
library(VIM)
library(mice)
library(vcdExtra)
library(car)
library(tidyverse)
# Only change the value for SID
# Assign your student id into the variable SID, for example:
# Only change the value for SID
# Assign your student id into the variable SID
SID <- 2330859 # Replace with your actual student ID
SIDoffset <- (SID %% 50) + 1 # Offset into the data: (SID mod 50) + 1
load("car-analysis-data.Rda") # Loads the cars.analysis data frame
# Subset the car data set: keep every 50th observation, starting at the offset
mydf <- cars.analysis[seq.int(from = SIDoffset, to = nrow(cars.analysis), by = 50), ]
I will find any NA or implausible values. These are significant because analyses could be affected by missing data – i.e. a trend line on a graph may not represent the true nature of the data. Implausible values can create outliers which also affects the regression line in modelling, still, outliers are plausible therefore implausibility must be well justified. Appropriate imputation methods will be used for variables with NA values starting with the variable with the highest number of NA values. I will also check that any implausible value was not chosen by the data owners for arbitrary imputation by checking the frequency of the value compared to other values. In this case, I would explore further to understand what the value meant and better understand how to impute the value. To keep as much of the data as possible, I will impute unknown values where appropriate. I will also check the levels in the categorical variables to check for misspellings and correct erroneous levels’ names. I will then convert all variables apart from price and mileage into factors because unlike price and mileage they are either binary, categorical or fixed values such as brand.
# Create a new data frame to store the transformations
# (keeps the raw subset mydf untouched for before/after comparisons)
mydf.comp <- mydf
# Check for any NAs
no.NA <- sum(is.na(mydf.comp)) # Gives a sum of the total NAs in the data frame
paste("There are", no.NA,"NA values in the data. They can be found in the following variables:")
[1] "There are 124 NA values in the data. They can be found in the following variables:"
colSums(is.na(mydf.comp)) # Shows how many NAs are in each variable (column-wise NA counts)
brand year mileage engine_size automatic_transmission
0 0 0 19 0
fuel drivetrain min_mpg max_mpg damaged
0 0 49 49 2
first_owner navigation_system bluetooth third_row_seating heated_seats
5 0 0 0 0
price
0
aggr(mydf) # VIM::aggr - visualizes the amount and pattern of missing data in the data frame
Most NAs are in the mpg variables.
# Impute max_mpg's NA values using its median
# Median computed excluding NAs and the implausible negative values
median_max_mpg <- median(mydf.comp$max_mpg[mydf.comp$max_mpg > 0], na.rm = TRUE)
mydf.comp$max_mpg[is.na(mydf.comp$max_mpg)] <- median_max_mpg # `<-` for assignment (was `=`)
# Compare its pre-imputation distribution to its post-imputation one
ggplot(data=mydf, aes(x=max_mpg)) + geom_histogram(bins = 10) + theme_linedraw() +ggtitle("Histogram of max_mpg without imputed median for NA values") + xlab("Max_mpg without imputed values")
ggplot(data=mydf.comp, aes(x=max_mpg)) + geom_histogram(bins = 10) + theme_linedraw() +ggtitle("Histogram of max_mpg with imputed median for NA values") + xlab("Max_mpg with imputed values")
This affects the distribution significantly. I will try Multiple Imputation by Chained Equations.
# Impute all NAs in the data using mice (Multiple Imputation by Chained Equations)
# A fixed seed makes the stochastic imputation reproducible across knits
mydf.mice <- mice(mydf, seed = SID)
iter imp variable
1 1 engine_size min_mpg max_mpg damaged first_owner
1 2 engine_size min_mpg max_mpg damaged first_owner
1 3 engine_size min_mpg max_mpg damaged first_owner
1 4 engine_size min_mpg max_mpg damaged first_owner
1 5 engine_size min_mpg max_mpg damaged first_owner
2 1 engine_size min_mpg max_mpg damaged first_owner
2 2 engine_size min_mpg max_mpg damaged first_owner
2 3 engine_size min_mpg max_mpg damaged first_owner
2 4 engine_size min_mpg max_mpg damaged first_owner
2 5 engine_size min_mpg max_mpg damaged first_owner
3 1 engine_size min_mpg max_mpg damaged first_owner
3 2 engine_size min_mpg max_mpg damaged first_owner
3 3 engine_size min_mpg max_mpg damaged first_owner
3 4 engine_size min_mpg max_mpg damaged first_owner
3 5 engine_size min_mpg max_mpg damaged first_owner
4 1 engine_size min_mpg max_mpg damaged first_owner
4 2 engine_size min_mpg max_mpg damaged first_owner
4 3 engine_size min_mpg max_mpg damaged first_owner
4 4 engine_size min_mpg max_mpg damaged first_owner
4 5 engine_size min_mpg max_mpg damaged first_owner
5 1 engine_size min_mpg max_mpg damaged first_owner
5 2 engine_size min_mpg max_mpg damaged first_owner
5 3 engine_size min_mpg max_mpg damaged first_owner
5 4 engine_size min_mpg max_mpg damaged first_owner
5 5 engine_size min_mpg max_mpg damaged first_owner
Warning: Number of logged events: 3
# Take the completed (imputed) data set from the mice object
mydf.comp <- complete(mydf.mice)
# Compare the max_mpg distribution before and after the mice imputation
ggplot(mydf, aes(x = max_mpg)) +
  geom_histogram(bins = 10) +
  theme_linedraw() +
  ggtitle("Histogram of max_mpg without imputed 'mice' for NA values") +
  xlab("Max_mpg without imputed values")
ggplot(mydf.comp, aes(x = max_mpg)) +
  geom_histogram(bins = 10) +
  theme_linedraw() +
  ggtitle("Histogram of max_mpg with imputed 'mice' for NA values") +
  xlab("Max_mpg with imputed values")
This method fits better.
# Statistically summarize the data (ranges reveal implausible values such as negatives)
summary(mydf.comp)
brand year mileage engine_size automatic_transmission fuel
Length:410 Min. :1964 Min. : 0 Min. : 1.200 Min. :0.0000 Length:410
Class :character 1st Qu.:2015 1st Qu.: 22286 1st Qu.: 2.000 1st Qu.:1.0000 Class :character
Mode :character Median :2018 Median : 44394 Median : 2.450 Median :1.0000 Mode :character
Mean :2017 Mean : 51588 Mean : 3.689 Mean :0.9146
3rd Qu.:2021 3rd Qu.: 72747 3rd Qu.: 3.500 3rd Qu.:1.0000
Max. :2023 Max. :209968 Max. :390.000 Max. :1.0000
drivetrain min_mpg max_mpg damaged first_owner navigation_system
Length:410 Min. : 0.00 Min. :-30.00 Min. :0.0000 Min. :0.0000 Min. :0.0000
Class :character 1st Qu.:18.00 1st Qu.: 24.00 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000
Mode :character Median :21.50 Median : 28.00 Median :0.0000 Median :0.0000 Median :0.0000
Mean :21.53 Mean : 28.25 Mean :0.2659 Mean :0.4902 Mean :0.4707
3rd Qu.:24.00 3rd Qu.: 32.00 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:1.0000
Max. :47.00 Max. : 51.00 Max. :1.0000 Max. :1.0000 Max. :1.0000
bluetooth third_row_seating heated_seats price
Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. : 4500
1st Qu.:1.0000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:18988
Median :1.0000 Median :0.0000 Median :1.0000 Median :29370
Mean :0.8854 Mean :0.1537 Mean :0.5098 Mean :28944
3rd Qu.:1.0000 3rd Qu.:0.0000 3rd Qu.:1.0000 3rd Qu.:37998
Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :54999
table(mydf.comp[mydf.comp < 0]) # Tabulate every negative value; NOTE(review): the data-frame-wide comparison relies on coercion for character columns - confirm only numeric columns can test negative
-30
2
Year, mileage and price values all seem plausible. Negative values in max_mpg are implausible. However, -30 may be a deliberate code rather than a data-entry error. I will verify using its frequency in cars.analysis.
# Check if and how frequently -30 were used in cars.analysis (the full, unsampled data)
summary(cars.analysis)
brand year mileage engine_size automatic_transmission fuel
Length:20483 Min. :1953 Min. : 0 Min. : 0.000 Min. :0.0000 Length:20483
Class :character 1st Qu.:2016 1st Qu.: 23582 1st Qu.: 2.000 1st Qu.:1.0000 Class :character
Mode :character Median :2019 Median : 44709 Median : 2.400 Median :1.0000 Mode :character
Mean :2017 Mean : 52575 Mean : 2.821 Mean :0.9149
3rd Qu.:2020 3rd Qu.: 73894 3rd Qu.: 3.500 3rd Qu.:1.0000
Max. :2023 Max. :383614 Max. :390.000 Max. :1.0000
NA's :1114
drivetrain min_mpg max_mpg damaged first_owner navigation_system
Length:20483 Min. : 0.00 Min. :-30 Min. :0.0000 Min. :0.0000 Min. :0.0000
Class :character 1st Qu.:18.00 1st Qu.: 24 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000
Mode :character Median :21.00 Median : 28 Median :0.0000 Median :0.0000 Median :0.0000
Mean :21.29 Mean : 28 Mean :0.2412 Mean :0.4952 Mean :0.4485
3rd Qu.:24.00 3rd Qu.: 32 3rd Qu.:0.0000 3rd Qu.:1.0000 3rd Qu.:1.0000
Max. :89.00 Max. :100 Max. :1.0000 Max. :1.0000 Max. :1.0000
NA's :2987 NA's :2970 NA's :204 NA's :308
bluetooth third_row_seating heated_seats price
Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. : 1495
1st Qu.:1.0000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:18995
Median :1.0000 Median :0.0000 Median :0.0000 Median :27998
Mean :0.8789 Mean :0.1445 Mean :0.4678 Mean :28458
3rd Qu.:1.0000 3rd Qu.:0.0000 3rd Qu.:1.0000 3rd Qu.:37000
Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :54999
table(cars.analysis[cars.analysis < 0]) # Frequency of each distinct negative value in the full data set
-30.0
101
101 ‘-30’ observations suggest it’s a code.
# Check for other arbitrary imputations by inspecting the full frequency table
table(cars.analysis$max_mpg)
-30 0 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 27.5 28 29 30
101 91 1 3 7 14 22 30 151 267 388 401 667 650 927 1125 1002 1052 1075 1 1452 1028 1333
31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53
1146 662 746 676 591 328 337 348 143 374 100 39 48 27 5 6 10 19 8 1 34 4 11
54 55 56 57 58 59 60 61 71 80 94 100
5 19 2 1 2 8 3 1 1 16 1 3
table(mydf.comp$max_mpg) # Frequency table of max_mpg in the working subset (contains the -30 and 0 codes)
-30 0 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 43
2 4 1 5 5 12 7 16 11 25 20 10 33 25 45 14 30 25 20 12 20 18 9 6 7 2 15 4 3
47 48 51
1 1 2
0 mpg means the car is stationary. I will combine -30 to 0 as code for: cars that do not run.
# Convert negative observations in the max_mpg variable to 0
# (-30 is treated as the same "car does not run" code as 0)
mydf.comp$max_mpg[mydf.comp$max_mpg < 0] <- 0
table(mydf.comp$max_mpg) # Check: the two -30s were folded into 0 (six 0 values, not eight)
0 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 43 47 48 51
6 1 5 5 12 7 16 11 25 20 10 33 25 45 14 30 25 20 12 20 18 9 6 7 2 15 4 3 1 1 2
A 390-litre car engine size is implausible.
# Find out if there are other values as large (390 should be the only outlier)
table(mydf.comp$engine_size)
1.2 1.3 1.4 1.5 1.6 1.8 2 2.3 2.4 2.5 2.9 3 3.3 3.4 3.5 3.6 3.7 3.8 4 4.2 4.4 4.6 4.7 4.8 5 5.3 5.5 5.6 5.7
5 1 22 13 14 10 120 2 18 36 1 43 8 1 33 23 3 5 7 1 2 6 6 1 10 2 1 5 2
6 6.2 6.8 390
3 4 1 1
# Change 390 to 3.9 - likely a misplaced decimal point
mydf.comp$engine_size[mydf.comp$engine_size==390] <- 3.9 # `<-` for assignment (was `=`)
summary(mydf.comp$engine_size) # Check the maximum is now plausible
Min. 1st Qu. Median Mean 3rd Qu. Max.
1.200 2.000 2.450 2.748 3.500 6.800
# Identify the levels of 'fuel'
table(mydf.comp$fuel) # 'Pertol' is a misspelling of 'Petrol'; 'Unknown' needs imputing
Diesel Electric GPL Hybrid Pertol Petrol Unknown
5 8 3 26 2 364 2
# Fold the misspelled 'Pertol' level into the correct 'Petrol' level
mydf.comp$fuel[mydf.comp$fuel == "Pertol"] <- "Petrol"
# Impute 'Unknown' values using Frequent Category Imputation (modal level)
# NOTE(review): the table still contains "Unknown" itself; this is safe only while
# "Unknown" is not the modal level - here Petrol clearly dominates
mydf.comp$fuel[(mydf.comp$fuel=="Unknown")] <- names(which.max(table(mydf.comp$fuel)))
table(mydf.comp$fuel) # Verify 'Pertol' and 'Unknown' are gone
Diesel Electric GPL Hybrid Petrol
5 8 3 26 368
# Check other categorical variable levels for errors (misspellings, odd levels)
table(mydf.comp$brand)
Alfa Audi BMW Cadillac Chevrolet FIAT Ford Honda
24 21 17 19 12 24 13 24
Hyundai Jaguar Jeep Kia Land Lexus Maserati Mazda
18 14 11 20 9 15 12 21
Mercedes-Benz MINI Mitsubishi Nissan Porsche Suzuki Toyota Volkswagen
15 21 24 13 9 2 10 21
Volvo
21
table(mydf.comp$drivetrain) # Check drivetrain levels; 'Unknown' needs imputing
Four-wheel Drive Front-wheel Drive Rear-wheel Drive Unknown
225 119 64 2
# Impute 'Unknown' drivetrain values with the most frequent category (the mode)
modal.drivetrain <- names(which.max(table(mydf.comp$drivetrain)))
mydf.comp$drivetrain[mydf.comp$drivetrain == "Unknown"] <- modal.drivetrain
table(mydf.comp$drivetrain) # Verify the 'Unknown' level is gone
Four-wheel Drive Front-wheel Drive Rear-wheel Drive
227 119 64
Categorical and binary should be factors and numerical should be numerical for EDA.
# Check to see if variables have been read in correctly (types per column)
str(mydf.comp)
'data.frame': 410 obs. of 16 variables:
$ brand : chr "Maserati" "Jeep" "Toyota" "Land" ...
$ year : num 2018 2021 2020 2015 2021 ...
$ mileage : num 19267 36433 35134 106110 48435 ...
$ engine_size : num 3 3.6 2.5 3 2.5 1.5 3 2 2 3.3 ...
$ automatic_transmission: num 1 1 1 1 1 1 1 1 1 1 ...
$ fuel : chr "Petrol" "Petrol" "Hybrid" "Petrol" ...
$ drivetrain : chr "Rear-wheel Drive" "Four-wheel Drive" "Four-wheel Drive" "Four-wheel Drive" ...
$ min_mpg : num 17 21 34 14 35 27 18 17 22 22 ...
$ max_mpg : num 24 28 35 19 36 36 23 23 28 29 ...
$ damaged : num 1 0 0 0 0 0 0 0 0 0 ...
$ first_owner : num 1 1 1 0 1 1 1 1 0 0 ...
$ navigation_system : num 1 0 1 0 1 0 1 0 1 0 ...
$ bluetooth : num 1 1 1 1 1 1 1 1 1 1 ...
$ third_row_seating : num 0 0 1 0 1 0 0 0 0 1 ...
$ heated_seats : num 1 0 1 0 1 1 0 0 1 0 ...
$ price : num 37699 47288 45698 19888 50995 ...
# Convert all variables into appropriate data types.
# Only the genuinely categorical/binary columns become factors; the numeric
# columns (year, mileage, engine_size, min_mpg, max_mpg, price) are left
# untouched. This avoids the original factor -> character -> numeric
# round-trip (which risks silent value corruption) and the inconsistent
# fallback to the pre-imputation mydf for price and mileage.
factor.columns <- c("brand", "automatic_transmission", "fuel", "drivetrain",
                    "damaged", "first_owner", "navigation_system",
                    "bluetooth", "third_row_seating", "heated_seats")
mydf.comp[factor.columns] <- lapply(mydf.comp[factor.columns], factor)
str(mydf.comp) # Shows that each variable has been correctly read in
'data.frame': 410 obs. of 16 variables:
$ brand : Factor w/ 25 levels "Alfa","Audi",..: 15 11 23 13 23 18 13 24 25 9 ...
$ year : num 2018 2021 2020 2015 2021 ...
$ mileage : num 19267 36433 35134 106110 48435 ...
$ engine_size : num 3 3.6 2.5 3 2.5 1.5 3 2 2 3.3 ...
$ automatic_transmission: Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
$ fuel : Factor w/ 5 levels "Diesel","Electric",..: 5 5 4 5 4 5 5 1 4 5 ...
$ drivetrain : Factor w/ 3 levels "Four-wheel Drive",..: 3 1 1 1 1 2 1 2 1 1 ...
$ min_mpg : num 17 21 34 14 35 27 18 17 22 22 ...
$ max_mpg : num 24 28 35 19 36 36 23 23 28 29 ...
$ damaged : Factor w/ 2 levels "0","1": 2 1 1 1 1 1 1 1 1 1 ...
$ first_owner : Factor w/ 2 levels "0","1": 2 2 2 1 2 2 2 2 1 1 ...
$ navigation_system : Factor w/ 2 levels "0","1": 2 1 2 1 2 1 2 1 2 1 ...
$ bluetooth : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
$ third_row_seating : Factor w/ 2 levels "0","1": 1 1 2 1 2 1 1 1 1 2 ...
$ heated_seats : Factor w/ 2 levels "0","1": 2 1 2 1 2 2 1 1 2 1 ...
$ price : num 37699 47288 45698 19888 50995 ...
There were 124 NAs which can bias analysis outcomes. After visualizing the effect of median imputation and Multiple Imputation by Chained Equations, the latter was chosen because it better matched the original distribution. There were no errors found in year, price or mileage because all seemed plausible and there was no missingness.
There was an implausible value (-30) in the max_mpg variable which I cross-referenced with the cars.analysis data to find 101 negative values, all -30. This suggests it’s an arbitrary imputation code perhaps suggesting the car is being sold for parts. Another code (0) was found and combined with -30 to represent malfunctioning cars. The reason is that 0 mpg means the car is stationary therefore cannot work, and thus is sold for parts.
A 390-litre engine size is implausible and was likely supposed to be 3.9, which is more in line with the variable mean of 3.7. I amended two misspellings in ‘fuel’ and imputed unknown observations using mode imputation in ‘fuel’ and ‘drivetrain’.
Finally, R read the binary variables incorrectly, so I converted all variables into factors except for the numerical variables.
I will start by visualizing the relationship between mileage (numerical) and price (numerical) using a scatterplot because intuitively I know that generally, cars with more miles are cheaper to buy. If the graph suggests correlation I will justify it statistically using Fisher’s p test and the correlation coefficient, before running a linear regression model with both variables and checking that the Residuals and Q-Q plot look random and normal respectively. I will do the same for ‘year’ for the same reason. I will then visualize the binary variable relationships to price using boxplots. Finally, I will visualize the mpg variables using scatter plots as they are both numerical variables. To avoid heteroskedasticity later, I will plot histograms to compare and visualize multicollinearity. If there is any, I will transform the data using the natural logarithm.
# Visualize the relationship between mileage and price
# (axis labels fixed: x is mileage, y is price - they were swapped)
ggplot(data=mydf.comp, aes(x=mileage, y=price)) + geom_point() + theme_linedraw() + ggtitle("Scatter plot of the relationship between price and mileage") + xlab("Mileage") + ylab("Price")
This scatter plot shows a negative relationship between price and mileage - the more mileage it has, the cheaper it is.
# Numerically justify the graph: Pearson correlation coefficient and its t-test
cor.test(mydf.comp$mileage,mydf.comp$price)
Pearson's product-moment correlation
data: mydf.comp$mileage and mydf.comp$price
t = -16.225, df = 408, p-value < 2.2e-16
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
-0.6817493 -0.5635850
sample estimates:
cor
-0.6262505
The correlation value is closer to -1 which means the higher the X-axis value (Mileage), the lower the Y-axis value (Price). I can use a hypothesis test to confirm my finding: H_0: Price & Mileage are not correlated & H_1: Price & Mileage are correlated, where \(\alpha=0.05\).
The p-value is: 2.2e-16 which is a lot smaller than alpha which means I have enough evidence to reject the null hypothesis. Price and mileage are correlated.
# Create a linear regression model of price on mileage.
# Use bare column names with data= (not mydf.comp$ inside the formula) so the
# fitted model works with predict(newdata=) and update().
lm.price.mileage <- lm(price ~ mileage, data = mydf.comp)
summary(lm.price.mileage)
Call:
lm(formula = mydf.comp$price ~ mydf.comp$mileage)
Residuals:
Min 1Q Median 3Q Max
-27062.1 -7243.4 -560.3 6438.2 27026.3
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 3.902e+04 7.733e+02 50.46 <2e-16 ***
mydf.comp$mileage -1.953e-01 1.204e-02 -16.23 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 9332 on 408 degrees of freedom
Multiple R-squared: 0.3922, Adjusted R-squared: 0.3907
F-statistic: 263.3 on 1 and 408 DF, p-value: < 2.2e-16
plot(lm.price.mileage) # Diagnostics: Residuals vs Fitted, Q-Q, Scale-Location, Leverage
The R^2, however, is smaller than 0.5, meaning the model explains only a modest share of the variance in price (R^2 measures explanatory power, not statistical significance). Still, the p-value suggests price and mileage are dependent, and the slope coefficient (-1.953e-01) means that for every additional mile, the price decreases by about 0.195. Visually, the Residuals vs Fitted plot looks like a funnel, suggesting heteroskedasticity.
# Create a new variable for the log of 'price' to tackle heteroskedasticity
mydf.comp$price.log <- log(mydf.comp$price)
# Run the model again with the log-transformed response
# (column names with data= instead of mydf.comp$ prefixes, so predict() works)
lm.price.log <- lm(price.log ~ mileage, data = mydf.comp)
summary(lm.price.log)
Call:
lm(formula = mydf.comp$price.log ~ mydf.comp$mileage)
Residuals:
Min 1Q Median 3Q Max
-1.22535 -0.23502 0.02798 0.26199 1.06008
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.061e+01 3.027e-02 350.62 <2e-16 ***
mydf.comp$mileage -8.657e-06 4.712e-07 -18.37 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.3653 on 408 degrees of freedom
Multiple R-squared: 0.4527, Adjusted R-squared: 0.4514
F-statistic: 337.5 on 1 and 408 DF, p-value: < 2.2e-16
plot(lm.price.log) # Diagnostics for the log-price model
The model is still heteroskedastic.
# Visualize the relationship between year and price
ggplot(mydf.comp, aes(x = year, y = price)) +
  geom_point() +
  theme_classic() +
  ggtitle("Scatter plot of price vs year") +
  xlab("Year") +
  ylab("Price") +
  theme(axis.text = element_text(size = 7))
This scatter graph shows a positive relationship - the newer the car the more expensive it is.
# Numerically justify the graph with a Pearson correlation test
cor.test(mydf.comp$year,mydf.comp$price)
Pearson's product-moment correlation
data: mydf.comp$year and mydf.comp$price
t = 11.894, df = 408, p-value < 2.2e-16
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
0.4317940 0.5759648
sample estimates:
cor
0.5074219
This correlation value is closer to 1 suggesting the more recent the year, the more expensive. Using the hypotheses \(H_0:\) Price & Year are not correlated & \(H_1:\) Price & Year are correlated, where \(\alpha=0.05\), the p-value is very small so I can reject the null hypothesis. Price and year are dependent.
# Create a linear regression model of price on year
# (column names with data= instead of mydf.comp$ prefixes, so predict() works)
lm.price.year <- lm(price ~ year, data = mydf.comp)
summary(lm.price.year)
Call:
lm(formula = mydf.comp$price ~ mydf.comp$year)
Residuals:
Min 1Q Median 3Q Max
-19625 -8413 -545 7247 56495
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -2.278e+06 1.939e+05 -11.74 <2e-16 ***
mydf.comp$year 1.144e+03 9.614e+01 11.89 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 10310 on 408 degrees of freedom
Multiple R-squared: 0.2575, Adjusted R-squared: 0.2557
F-statistic: 141.5 on 1 and 408 DF, p-value: < 2.2e-16
plot(lm.price.year) # Diagnostics for the price-on-year model
The R-squared value is 26%, so the model explains only about a quarter of the variance in price (low explanatory power, which is distinct from significance); however, the coefficients are highly significant: for every unit increase in year, the price increases by about 1.144e+03. The low R-squared could suggest lm is not a good model. Moreover, the Residuals vs Fitted plot does not look random and has a funnel shape, which suggests there is heteroskedasticity.
# Run the model again with the log-transformed price as the response
# (column names with data= instead of mydf.comp$ prefixes, so predict() works)
lm.year.log <- lm(price.log ~ year, data = mydf.comp)
summary(lm.year.log)
Call:
lm(formula = mydf.comp$price.log ~ mydf.comp$year)
Residuals:
Min 1Q Median 3Q Max
-1.27462 -0.26062 0.05506 0.28794 2.55329
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -88.81052 7.88727 -11.26 <2e-16 ***
mydf.comp$year 0.04906 0.00391 12.55 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.4195 on 408 degrees of freedom
Multiple R-squared: 0.2785, Adjusted R-squared: 0.2767
F-statistic: 157.5 on 1 and 408 DF, p-value: < 2.2e-16
plot(lm.year.log) # Diagnostics for the log-price-on-year model
The model is still heteroskedastic.
# Visualize the relationship between the transmission and price
ggplot(mydf.comp, aes(x = automatic_transmission, y = price)) +
  geom_boxplot() +
  theme_minimal() +
  ggtitle("Transmission type vs Price") +
  ylab("Price") +
  xlab("Transmission type (0=Not automatic, 1= Automatic)")
Cars with automatic transmission generally sell at a higher price for a median of 30,000.
# Visualize the relationship between damaged and price
# (closing parenthesis added to the previously unbalanced x-axis label)
ggplot(mydf.comp, aes(x=damaged, y=price)) + geom_boxplot() + theme_bw() + ggtitle("Price vs damaged") + xlab("Damaged (0=Not damaged, 1= Damaged)") + ylab("Price")
Damaged cars are sold around 10,000 cheaper
# Visualize the relationship between the seller type and price
ggplot(mydf.comp, aes(x = first_owner, y = price)) +
  geom_boxplot() +
  theme_linedraw() +
  ggtitle("First owner vs Price") +
  ylab("Price") +
  xlab("First owner (0=No, 1=Yes)")
This boxplot shows that cars with only one owner sell at a higher price than those with multiple owners.
# Visualize the relationship between the drivetrain and price
ggplot(mydf.comp, aes(x = drivetrain, y = price)) +
  geom_boxplot() +
  theme_minimal() +
  ggtitle("Drivetrain type vs Price") +
  ylab("Price") +
  xlab("Drivetrain type")
Four wheel drive is more expensive followed (with a lot of overlap) by rear wheel drive – although it has a lower mean
# Visualize the relationship between the fuel type and price
ggplot(mydf.comp, aes(x = fuel, y = price)) +
  geom_boxplot() +
  theme_minimal() +
  ggtitle("Fuel type vs Price") +
  ylab("Price") +
  xlab("Fuel type")
Electric cars seem to be the most expensive cars while GPL cars are the cheapest. Most of the overlap is between cheaper hybrids and expensive petrol cars.
# Visualize the relationship between max_mpg and price, with an lm trend line
ggplot(mydf.comp, aes(x = max_mpg, y = price)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  theme_classic() +
  ggtitle("Scatter plot of price vs maximum miles per gallon (mpg)") +
  xlab("Maximum mpg") +
  ylab("Price")
Though there is no particular trend, it could be argued there is a downward trend where price decreases as maximum mpg increases.
# Visualize the relationship between min_mpg and price, with an lm trend line
ggplot(mydf.comp, aes(x = min_mpg, y = price)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  theme_classic() +
  ggtitle("Scatter plot of price vs minimum miles per gallon (mpg)") +
  xlab("Minimum mpg") +
  ylab("Price")
The same argument can be made that the larger the minimum mpg, the less expensive the car.
# Visualize the relationship between heated seats, price and third row seating
ggplot(mydf.comp, aes(x = heated_seats, y = price, fill = third_row_seating)) +
  geom_boxplot() +
  theme_bw() +
  ggtitle("Price vs heated seats and third row seating") +
  xlab("Heated seats") +
  ylab("Price")
Third row seating increases the price, there’s little difference between heated third row seating but a large difference between no third row or heated seats and unheated third seats.
# Visualize the relationship between bluetooth, price and navigation system
# (typo "sysetm" in the plot title corrected)
ggplot(mydf.comp, aes(x=bluetooth, y=price, fill=navigation_system)) + geom_boxplot() + theme_bw() + ggtitle("Price vs bluetooth and navigation system") + xlab("Bluetooth") + ylab("Price")
Bluetooth increases cost but the combination of no bluetooth and a navigation system is more expensive than both bluetooth and navigation.
# Visualize the relationship between engine_size and price
ggplot(mydf.comp, aes(x = engine_size, y = price)) +
  geom_point() +
  theme_classic() +
  ggtitle("Scatter plot of price vs engine_size") +
  xlab("Engine size") +
  ylab("Price")
Larger engine sizes are generally more expensive, but there are a lot of expensive cars with smaller engines. 2 litre engines seem the most popular.
# Visualize the relationship between brand and price
ggplot(mydf.comp, aes(x = brand, y = price)) +
  geom_point() +
  theme_classic() +
  ggtitle("Scatter plot of price vs brand") +
  xlab("brand") +
  ylab("Price") +
  theme(axis.text = element_text(size = 3.7))
Although most brands operate within the same price range, some brands cater to mid-to-high prices (Alfa, Porsche, Maserati), while others are more budget friendly (FIAT, Mitsubishi, Mazda).
# Visualize 'brand' frequencies
table(mydf.comp$brand) # To find frequencies of each brand in the subset
Alfa Audi BMW Cadillac Chevrolet FIAT Ford Honda
24 21 17 19 12 24 13 24
Hyundai Jaguar Jeep Kia Land Lexus Maserati Mazda
18 14 11 20 9 15 12 21
Mercedes-Benz MINI Mitsubishi Nissan Porsche Suzuki Toyota Volkswagen
15 21 24 13 9 2 10 21
Volvo
21
# Build the bar chart directly from the frequency table so the counts and
# names can never drift out of sync with the data (previously they were
# hand-transcribed from the console output). The densest shading highlights
# the most frequent brand(s), matching the original hard-coded density vector.
brand.count <- table(mydf.comp$brand)
barplot(brand.count,
        main = "Bar chart of brand frequency in 'mydf.comp'",
        ylab = "Frequency in 'mydf.comp'",
        cex.names = 0.7,
        col = "steelblue",
        density = ifelse(brand.count == max(brand.count), 100, 50))
The first scatter plot shows a negative relationship between price and mileage - the more mileage a car has, the cheaper it is. The correlation value was closer to -1 which means the higher the X-axis value (Mileage), the lower the Y-axis value (Price).
If \(H_0:\) Price & Mileage are not correlated & \(H_1:\) Price & Mileage are correlated, where \(\alpha=0.05\).
The p-value is: 2.2e-16 which is a lot smaller than the alpha which means I have enough evidence to reject the null hypothesis. Price and mileage are correlated.
The R^2 is below 0.5, so the model explains less than half of the variance in price (explanatory power, not significance). Still, the p-value suggests they are dependent and the slope coefficient suggests that for every unit increase in mileage, the price decreases by about 0.195. Visually, the Residuals vs Fitted plot looks like a funnel, suggesting heteroskedasticity. The scatterplot suggests a positive relationship - the newer the car the more expensive it is.
This correlation value is closer to 1 suggesting the more recent, the more expensive. With hypotheses \(H_0:\) Price & Year are not correlated & \(H_1:\) Price & Year are correlated, where \(\alpha=0.05\), the p-value is very small so I can reject the null hypothesis. Price and year are dependent.
This correlation value is closer to 1 suggesting the more recent the year, the more expensive. Using the hypotheses \(H_0:\) Price & Year are not correlated & \(H_1:\) Price & Year are correlated, where \(\alpha=0.05\), the p-value is very small so I can reject the null hypothesis. Price and year are dependent.
The model is still heteroskedastic.
Cars with automatic transmissions generally sell at a higher price for a median of 30,000. Damaged cars are sold around 10,000 cheaper. This boxplot shows that cars with only one owner sell at a higher price than those with multiple owners. Four wheel drive is more expensive followed (with a lot of overlap) by rear wheel drive – although it has a lower mean
Electric cars seem to be the most expensive cars while GPL cars are the cheapest. Most of the overlap is between cheaper hybrids and expensive petrol cars. Though there is no particular trend, it could be argued there is a downward trend where price decreases as maximum mpg increases.
The same argument can be made that the larger the minimum mpg, the less expensive the car. Third-row seating increases the price. Bluetooth increases cost but the combination of no Bluetooth and a navigation system is more expensive than both Bluetooth and navigation.
Larger engine sizes are generally more expensive, but there are a lot of expensive cars with smaller engines. 2-litre engines seem the most popular.
Year and mileage seem to have linear relationships with price (dependent), however they are heteroscedastic and despite transforming the dependent variable using the natural logarithm, the linear regression was still heteroscedastic. To model price as the numeric dependent along with a combination of numeric, categorical and binary independent variables, I will use the linear model function. This will provide Fisher’s p-value which I can compare to my hypothesis and alpha. If smaller than alpha, the p-value will confirm the statistical significance of the model. The output will also give coefficients which indicate whether they are significant, denoted by significance stars (‘*’, ‘**’, ‘***’), and how the target variable responds to them. My approach will be to use Analysis of Covariance (ANCOVA) because of the mixed independent and numeric dependent variables. I will create a maximal multiple linear regression model using the lm() function. I will then use the step() function to find the minimum adequate model. Then I will use the summary() function on the minimum adequate model to understand the significance of each variable. I will also create a log-transformed regression model and check which explains the price more effectively.
# Build a maximal multiple linear regression model with price as the dependent.
# price.log is excluded from the predictors: it is a deterministic transform of
# the response, so the original formula (mydf.comp$price ~ .) leaked the answer
# into the model - confirmed by price.log appearing in the stepwise result.
# Bare column names with data= also let predict(newdata=) work later.
maximal.mlr <- lm(price ~ . - price.log, data = mydf.comp)
summary(maximal.mlr)
# Use the step() function (AIC-based backward selection) to get the minimum adequate model
model.1 <- step(maximal.mlr)
summary(model.1)
plot(model.1) # Diagnostic plots for the minimum adequate model
# Check the variance inflation factors for multicollinearity
vif(maximal.mlr)
vif(model.1)
# Log-linear model: linear regression on log(price). NOTE: this is NOT logistic
# regression - the response is continuous, not binary.
lm.model <- lm(price.log ~ . - price, data = mydf.comp)
summary(lm.model)
step(lm.model)
Fisher’s p-value is smaller than \(\alpha\), which means that the variables in this model collectively affect price. Brand is among the variables that most affect price, with significant coefficients for brands such as Suzuki, Volvo, Maserati, Mercedes-Benz, and FIAT. The price will increase by 1.528e+03 if the car has a navigation system and will decrease by 3.024e+03 if it has Bluetooth. For every unit increase in year the price changes by 1.124e+02, and for every unit increase in minimum mpg it changes by 1.337e+02. Akaike’s information criterion has established that the minimum adequate model is: price ~ brand + year + mileage + drivetrain + min_mpg + max_mpg + navigation_system + bluetooth + heated_seats + price.log (note: price.log appearing as a predictor indicates response leakage and it should be excluded from the maximal model).
Multicollinearity was found in the log-transformed regression model. The minimum adequate model’s Residuals vs Fitted plot has a funnel shape suggesting heteroscedasticity, while the Q-Q plot looks normal with a slight ‘s’ shape forming due to outliers.
Heteroscedasticity can be mitigated by transforming the dependent variable using the natural logarithm. Constant variance of residuals is one of the assumptions of conventional linear regression, and it is violated by heteroscedasticity. Another option is the Box–Cox transformation, which stabilises non-constant variance and reduces non-normality.
I propose model.1 as the model to predict pricing because its F-test p-value shows it is statistically significant, and it has been reduced from a maximal model to a minimal adequate model.
# Visualize first owner and price (boxplot of price split by ownership status)
ggplot(mydf.comp, aes(x=first_owner, y=price)) + geom_boxplot() + theme_linedraw() + ggtitle("First owner vs Price") + ylab("Price") + xlab("First owner (0=No, 1=Yes)")
# Visualize first owner and mileage (the original comment said "year", but the plot maps mileage to y)
ggplot(mydf.comp, aes(x=first_owner, y=mileage)) + geom_boxplot() + theme_linedraw() + ggtitle("First owner vs Mileage") + ylab("Mileage") + xlab("First owner (0=No, 1=Yes)")
To model the probability of a binary outcome being true I must use a generalized linear model. I will use all variables and then use the stepwise function to find the minimum adequate model. In order to model the probability of a car being sold by its first owner I must use the logit link so that the predicted probability P(first_owner = 1) cannot fall below 0 or exceed 1; R calculates the logit automatically when family = binomial. I will then find the odds ratios to better understand the coefficients and explain the most significant variables and their impact on first_owner = 1.
# Maximal generalized linear model (logistic regression: first_owner is binary).
# Use bare column names together with `data =` rather than the mydf.comp$...
# form: the $-prefixed formula is redundant when `data` is supplied, produces
# unreadable coefficient labels, and breaks predict() on new data.
model.2 <- glm(first_owner ~ price + brand + year + mileage + engine_size +
                 automatic_transmission + fuel + drivetrain + min_mpg +
                 max_mpg + damaged + navigation_system + bluetooth +
                 third_row_seating + heated_seats,
               data = mydf.comp, family = "binomial")
summary(model.2)
# Find the minimum adequate model via AIC-based stepwise selection
step(model.2)
# Create the minimal adequate model suggested by step().
# Bare column names with `data =` (instead of mydf.comp$...) keep the
# coefficient labels readable and allow predict() on new data frames.
model.3 <- glm(first_owner ~ year + mileage + engine_size +
                 automatic_transmission + min_mpg + max_mpg + bluetooth +
                 third_row_seating + heated_seats,
               family = "binomial", data = mydf.comp)
# Re-display the maximal model's summary for comparison (its output is pasted
# below); the minimum adequate model is summarized afterwards
summary(model.2)
Call:
glm(formula = mydf.comp$first_owner ~ mydf.comp$price + mydf.comp$brand +
mydf.comp$year + mydf.comp$mileage + mydf.comp$engine_size +
mydf.comp$automatic_transmission + mydf.comp$fuel + mydf.comp$drivetrain +
mydf.comp$min_mpg + mydf.comp$max_mpg + mydf.comp$damaged +
mydf.comp$navigation_system + mydf.comp$bluetooth + mydf.comp$third_row_seating +
mydf.comp$heated_seats, family = "binomial", data = mydf.comp)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -5.774e+02 1.537e+02 -3.758 0.000172 ***
mydf.comp$price -1.324e-05 2.886e-05 -0.459 0.646406
mydf.comp$brandAudi 1.052e+00 7.728e-01 1.362 0.173236
mydf.comp$brandBMW 4.872e-01 8.353e-01 0.583 0.559751
mydf.comp$brandCadillac 2.685e-01 8.091e-01 0.332 0.739963
mydf.comp$brandChevrolet 1.251e+00 1.212e+00 1.032 0.301907
mydf.comp$brandFIAT 9.916e-01 7.952e-01 1.247 0.212406
mydf.comp$brandFord 4.050e+00 1.330e+00 3.046 0.002322 **
mydf.comp$brandHonda 1.157e+00 8.214e-01 1.409 0.158913
mydf.comp$brandHyundai 2.924e-01 8.212e-01 0.356 0.721824
mydf.comp$brandJaguar 2.238e+00 1.001e+00 2.237 0.025316 *
mydf.comp$brandJeep 1.830e+00 1.078e+00 1.698 0.089506 .
mydf.comp$brandKia 9.998e-02 7.819e-01 0.128 0.898254
mydf.comp$brandLand 7.259e-01 1.121e+00 0.648 0.517134
mydf.comp$brandLexus 6.064e-01 8.645e-01 0.701 0.483019
mydf.comp$brandMaserati -1.062e+00 1.231e+00 -0.863 0.388045
mydf.comp$brandMazda 1.425e+00 8.519e-01 1.673 0.094408 .
mydf.comp$brandMercedes-Benz 1.282e+00 9.381e-01 1.367 0.171592
mydf.comp$brandMINI 9.684e-01 8.397e-01 1.153 0.248810
mydf.comp$brandMitsubishi 6.715e-01 8.062e-01 0.833 0.404886
mydf.comp$brandNissan 4.970e-01 1.003e+00 0.496 0.620084
mydf.comp$brandPorsche 6.171e-01 1.065e+00 0.579 0.562480
mydf.comp$brandSuzuki -1.280e+01 1.664e+03 -0.008 0.993861
mydf.comp$brandToyota 9.743e-01 1.127e+00 0.865 0.387281
mydf.comp$brandVolkswagen -3.929e-01 7.833e-01 -0.502 0.615936
mydf.comp$brandVolvo 8.588e-01 7.943e-01 1.081 0.279580
mydf.comp$year 2.884e-01 7.650e-02 3.769 0.000164 ***
mydf.comp$mileage -2.110e-05 6.904e-06 -3.056 0.002245 **
mydf.comp$engine_size -3.493e-01 2.256e-01 -1.548 0.121553
mydf.comp$automatic_transmission1 1.383e+00 6.703e-01 2.063 0.039099 *
mydf.comp$fuelElectric -7.936e-01 1.604e+00 -0.495 0.620677
mydf.comp$fuelGPL -1.847e+01 1.194e+03 -0.015 0.987656
mydf.comp$fuelHybrid -9.734e-01 1.230e+00 -0.791 0.428723
mydf.comp$fuelPetrol -1.303e+00 1.086e+00 -1.200 0.230197
mydf.comp$drivetrainFront-wheel Drive 7.243e-02 4.055e-01 0.179 0.858213
mydf.comp$drivetrainRear-wheel Drive -4.419e-01 4.727e-01 -0.935 0.349878
mydf.comp$min_mpg 2.132e-01 8.007e-02 2.663 0.007738 **
mydf.comp$max_mpg -2.053e-01 7.468e-02 -2.750 0.005969 **
mydf.comp$damaged1 -1.072e-01 3.226e-01 -0.332 0.739544
mydf.comp$navigation_system1 7.963e-02 3.537e-01 0.225 0.821887
mydf.comp$bluetooth1 -1.489e+00 6.670e-01 -2.232 0.025591 *
mydf.comp$third_row_seating1 7.729e-01 4.425e-01 1.746 0.080743 .
mydf.comp$heated_seats1 -7.676e-01 3.253e-01 -2.360 0.018291 *
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 568.22 on 409 degrees of freedom
Residual deviance: 358.54 on 367 degrees of freedom
AIC: 444.54
Number of Fisher Scoring iterations: 15
# Summarize the significant coefficients of the minimum adequate model (model.3)
summary(model.3)
Call:
glm(formula = mydf.comp$first_owner ~ mydf.comp$year + mydf.comp$mileage +
mydf.comp$engine_size + mydf.comp$automatic_transmission +
mydf.comp$min_mpg + mydf.comp$max_mpg + mydf.comp$bluetooth +
mydf.comp$third_row_seating + mydf.comp$heated_seats, family = "binomial",
data = mydf.comp)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -5.129e+02 1.158e+02 -4.428 9.51e-06 ***
mydf.comp$year 2.558e-01 5.742e-02 4.455 8.40e-06 ***
mydf.comp$mileage -1.814e-05 5.366e-06 -3.380 0.000724 ***
mydf.comp$engine_size -3.375e-01 1.532e-01 -2.203 0.027622 *
mydf.comp$automatic_transmission1 8.562e-01 5.862e-01 1.461 0.144131
mydf.comp$min_mpg 2.168e-01 6.623e-02 3.273 0.001064 **
mydf.comp$max_mpg -1.990e-01 6.125e-02 -3.250 0.001155 **
mydf.comp$bluetooth1 -1.105e+00 5.811e-01 -1.901 0.057283 .
mydf.comp$third_row_seating1 8.771e-01 3.829e-01 2.290 0.021997 *
mydf.comp$heated_seats1 -8.547e-01 2.746e-01 -3.113 0.001854 **
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 568.22 on 409 degrees of freedom
Residual deviance: 392.45 on 400 degrees of freedom
AIC: 412.45
Number of Fisher Scoring iterations: 5
# Exponentiate the logit coefficients to obtain odds ratios, which are easier
# to interpret than log-odds (values > 1 raise the odds, values < 1 lower them)
exp(model.3$coefficients)
(Intercept) mydf.comp$year mydf.comp$mileage
1.857854e-223 1.291497e+00 9.999819e-01
mydf.comp$engine_size mydf.comp$automatic_transmission1 mydf.comp$min_mpg
7.135350e-01 2.354160e+00 1.242067e+00
mydf.comp$max_mpg mydf.comp$bluetooth1 mydf.comp$third_row_seating1
8.195105e-01 3.312720e-01 2.403891e+00
mydf.comp$heated_seats1
4.254319e-01
For every one-unit increase in a predictor, the odds of being the first owner are multiplied by that predictor’s odds ratio: values below 1 decrease the odds and values above 1 increase them. For example, for every one-unit increase in max_mpg, the odds of the car being sold by its first owner are multiplied by 8.195105e-01, i.e. the odds decrease.
The odds of being a first_owner (versus not being the first_owner) increase by a factor of 1.291497 for every additional year. The log odds of first_owner are assumed to be linear in the independent variables, which stabilises non-constant variation and non-normal data. The model started as a maximal model and Akaike’s information criterion was then used to create the minimal adequate model, which shows how important year, mileage and heated seats are to selling as a first owner.